Load packages
library(tidyverse)
library(gridExtra)
library(cowplot)
library(ggridges)
library(ggstance)
library(treeio)
library(ggtree)
library(tidytree)
Load and combine data
species <- read_csv("../data/species_data.csv") %>% select(-species)
data <- read_csv("../data/data.csv") %>%
left_join(species, by = c("species" = "species_formatted")) %>%
rename(label = species_latin)
data2 <- data %>% group_by(species, label) %>% summarise(n = sum(n))
fulltree <- read.nexus("../data/consensusTree_10kTrees_298Primates_V3.nex")
refs <- read_csv("../data/ref_nodes.csv")
# turn tree into tidy dataframe
tree2 <- as_tibble(fulltree)
tree3 <- tree2 %>%
left_join(data2) %>%
mutate(
hasN = ifelse(is.na(n), 0, .5),
hasN2 = ifelse(is.na(n), .1, .5)) %>%
left_join(refs) %>%
# # also merge w datasheet listing node, n for inner nodes
# left_join(Ns, by = "node") %>%
# mutate(n = coalesce(n.x, n.y)) %>%
# select(-n.x, -n.y) %>%
groupClade(c(493, 496, 429, 302, 408)) %>%
mutate(group = fct_recode(group, "2" = "1"))
Joining, by = "label"
Joining, by = "node"
# turn back into tree
tree4 <- as.treedata(tree3)
This makes a rectangular and a circular tree with the node numbers displayed for reference (saved in the graphs folder).
tree3.2 <- as.treedata(tree3)
# display node numbers for reference
ggtree(tree3.2) +
# tip labels
geom_tippoint(aes(size = n), col = "seagreen", alpha = .5) +
geom_tiplab(offset = 1, size = 3) +
# node labels
geom_text(aes(label = node, x = branch), size = 2, col = "blue", vjust = -.5) +
expand_limits(x = 90) +
# display timescale at the bottom
theme_tree2()
ggsave("../graphs/full_tree_nodes.pdf", width = 8, height = 20, scale = 2)
ggtree(tree3.2, layout = "circular") +
geom_tippoint(aes(size = n), col = "seagreen", alpha = .5) +
geom_tiplab2(offset = 2, size = 3) +
geom_text2(aes(label = node), size = 1.5, col = "blue") +
xlim(NA, 100)
ggsave("../graphs/full_tree_nodes_circular.pdf", width = 8, height = 8, scale = 2)
cols <- viridis::viridis(4, end = .9)
# base plot
p <- ggtree(tree4, aes(size = hasN, alpha = hasN2), layout = "circular") +
# root
geom_rootpoint(size = 1) +
# tips
geom_tippoint(aes(size = n), alpha = .5) +
geom_tiplab2(aes(alpha = hasN), offset = 2, size = 3) +
# tweak scales
scale_alpha_continuous(range = c(.3, 1)) +
scale_size_continuous(range = c(.5, 8)) +
# widen plotting area
xlim(NA, 100)
# highlight clades with background colors
p2 <- p +
geom_hilight(node = 493, fill = cols[1], alpha = .3) +
geom_hilight(node = 496, fill = cols[1], alpha = .3) +
geom_hilight(node = 429, fill = cols[2], alpha = .3) +
geom_hilight(node = 303, fill = cols[3], alpha = .3) +
geom_hilight(node = 408, fill = cols[4], alpha = .3) +
# plot tree again to be on top of the highlights
geom_tree() +
geom_rootpoint(size = 1)
# highlight clades with branch colors
p3 <- p +
aes(col = group) +
scale_color_manual(values = c("gray30", cols))
plots <- mget(c("p", "p2", "p3"))
grid.arrange(p, p2, p3, nrow = 1)
# png with 3x1
ggsave("../graphs/phylo_full.png", arrangeGrob(grobs = plots, nrow = 1),
width = 24, height = 8, scale = 2, dpi = 72)
Removed 287 rows containing missing values (geom_point_g_gtree).Removed 287 rows containing missing values (geom_point_g_gtree).Removed 287 rows containing missing values (geom_point_g_gtree).
# pdf with 1 per page
ggsave("../graphs/phylo_full.pdf", marrangeGrob(grobs = plots, nrow = 1, ncol = 1),
width = 8, height = 8, scale = 2, dpi = 72)
Removed 287 rows containing missing values (geom_point_g_gtree).Removed 287 rows containing missing values (geom_point_g_gtree).Removed 287 rows containing missing values (geom_point_g_gtree).
# subset tree to just those species who have sample sizes reported, i.e. those who were tested
to_drop <- tree3 %>% filter(is.na(n)) %>% pull(label)
tree5 <- drop.tip(tree4, to_drop)
data3 <- data %>%
select(label, everything()) %>%
rename(num = n)
# species with more than X sites can get a density?
d3a <- data3 %>% group_by(species) %>% filter(n_distinct(site) >= 4)
d3b <- data3 %>% # setdiff(data3, d3a) ## <- do setdiff instead to NOT show points for densities
group_by(species) %>%
# create variable num2 is NA if there's only one data point for a species
# --> those species will only get the vertical crossbar
mutate(flag = n_distinct(site) == 1) %>%
ungroup %>%
mutate(num2 = ifelse(flag, NA, num))
# for vertical crossbar = median
d4a <- data3 %>%
group_by(label, species) %>%
summarise(Mdn = median(num)) # totalN = sum(num), sitesN = n_distinct(site)
# for cross = median (sample size by species and site)
d4b <- data3 %>%
group_by(label, species, site) %>%
summarise(Mdn_site = median(num))%>%
group_by(label, species) %>%
summarise(Mdn_site2 = median(Mdn_site))
# make NA when equal to d4$Mdn
d4 <- full_join(d4a, d4b) %>%
mutate(Mdn_site2 = ifelse(Mdn == Mdn_site2, NA, Mdn_site2))
Joining, by = c("label", "species")
# for vertical line in ridge plot (grand median)
# + hacky way to make horizontal grid lines for right panel only
v <- tibble(reference = c(NA, median(data3$num)), .panel = c("Tree", "xSample size"))
h <- tibble(reference = c(NA, 1:Ntip(tree5)), .panel = c("Tree", rep("xSample size", Ntip(tree5))))
# for axis labels
ax <- tibble(lab = c("Distance", "Sample size"), x = c(67.5, max(data3$num)/2), y = -.3,
.panel = c("Tree", "xSample size"))
# right-side viz depends on the number of sites per species:
# 1 site = vertical crossbar only
# 2+ sites = points + crossbar at median
# X+ sites = densities (currently, X = 4 just to illustrate)
# LEFT FACET
q <- ggtree(tree5, aes(col = group)) +
# root
geom_rootedge(rootedge = 5) +
# tip labels
geom_tippoint(aes(size = n), alpha = .5) +
geom_tiplab(offset = 4, size = 3) +
# tweak scales
scale_color_manual(values = c("grey30", cols)) +
scale_fill_manual(values = cols[4]) + # when all categories are taken: cols
# display timescale at the bottom
theme_tree2() +
xlim_tree(135) +
# add axis labels
geom_text(data = ax, aes(label = lab), col = "black") +
scale_y_continuous(limits = c(1, Ntip(tree5)), oob = function(x, ...) x) +
coord_cartesian(clip = "off") +
# add reference lines (these will show up on right panel of facet_plot only)
geom_hline(data = h, aes(yintercept = reference), lwd = .2, col = "grey", alpha = .5) +
geom_vline(data = v, aes(xintercept = reference), lwd = 1.5, col = "grey", alpha = .3) +
# remove facet strips, expand bottom margin (to make space for x axis labels)
theme(strip.text = element_blank(), strip.background = element_blank(),
plot.margin = unit(c(.5, .5, 1.2, .5), "cm"))
# dirty hack: x in front of "Sample size" is to have that panel sort to the right (alphabetically) until I figure out why it doesn't just go by order. This cropped up as an issue when I added the dummy point for the x-axis expansion...
# ADD RIGHT FACET
q %>%
# densities for species with enough sites
facet_plot("xSample size", d3a, geom_density_ridges, aes(x = num, group = label, fill = group),
alpha = .5, lwd = .3, position = position_nudge(y = .1)) %>%
# vertical crossbar for Mdn, for Mdn of site medians
facet_plot("xSample size", d4, geom_crossbarh, aes(x = Mdn, xmin = Mdn, xmax = Mdn, group = label,
col = group), alpha = .5, width = .3) %>%
facet_plot("xSample size", d4, geom_point, aes(x = Mdn_site2, group = label), shape = 4, size = 2,
stroke = 1.5, alpha = .8) %>%
# vertical mark for individual sites
facet_plot("xSample size", d3b, geom_jitter, aes(x = num2, group = label), shape = "|", size = 3,
width = .1, height = 0)
ggsave("../graphs/phylo_ridge_site.png", width = 4, height = 3, scale = 2)
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tidytree_0.2.8 ggtree_1.16.6 treeio_1.8.2 ggstance_0.3.3 ggridges_0.5.1
[6] cowplot_1.0.0 gridExtra_2.3 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3
[11] purrr_0.3.2 readr_1.3.1 tidyr_1.0.0 tibble_2.1.3 ggplot2_3.2.1
[16] tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 lubridate_1.7.4 ape_5.3 lattice_0.20-38 assertthat_0.2.1
[6] zeallot_0.1.0 digest_0.6.21 R6_2.4.0 cellranger_1.1.0 plyr_1.8.4
[11] backports_1.1.5 evaluate_0.14 httr_1.4.1 pillar_1.4.2 rlang_0.4.0
[16] lazyeval_0.2.2 readxl_1.3.1 rstudioapi_0.10 rmarkdown_1.16 labeling_0.3
[21] munsell_0.5.0 broom_0.5.2 compiler_3.6.1 modelr_0.1.5 xfun_0.10
[26] pkgconfig_2.0.3 base64enc_0.1-3 htmltools_0.4.0 tidyselect_0.2.5 viridisLite_0.3.0
[31] crayon_1.3.4 withr_2.1.2 grid_3.6.1 nlme_3.1-141 jsonlite_1.6
[36] gtable_0.3.0 lifecycle_0.1.0 magrittr_1.5 scales_1.0.0 cli_1.1.0
[41] stringi_1.4.3 reshape2_1.4.3 viridis_0.5.1 xml2_1.2.2 rvcheck_0.1.5
[46] vctrs_0.2.0 generics_0.0.2 tools_3.6.1 glue_1.3.1 hms_0.5.1
[51] parallel_3.6.1 yaml_2.2.0 colorspace_1.4-1 BiocManager_1.30.7 rvest_0.3.4
[56] knitr_1.25 haven_2.1.1